R Tutorial
An introduction to R
Introduction
This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .
R Project Logo
Download
- Download at https://www.r-project.org/
- Download
RStudioat https://rstudio.com/products/rstudio/download/
Calculator
can be used as a super awesome calculator
# 5 + 3 = 8
5 + 3 ## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) ## [1] 8
# 2 * 2 * 2 = 8
2^3 ## [1] 8
# 8 * 8 = 64
sqrt(64) ## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) ## [1] 8
Functions
has many useful built in functions
1:10## [1] 1 2 3 4 5 6 7 8 9 10
as.character(1:10)## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rep(1:2, times = 5)## [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)## [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)## [1] 5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)## [1] 5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")## [1] "1-20" "2-21" "3-22" "4-23" "5-24" "6-25" "7-26" "8-27" "9-28" "10-29" "1-30"
paste(1:10, collapse = "-")## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
min(1:10)## [1] 1
max(1:10)## [1] 10
range(1:10)## [1] 1 10
mean(1:10)## [1] 5.5
sd(1:10)## [1] 3.02765
Custom Functions
Users can also create their own functions
customFunction1 <- function(x, y) {
z <- 100 * x / (x + y)
paste(z, "%")
}
customFunction1(x = 10, y = 90)## [1] "10 %"
customFunction2 <- function(x) {
mymin <- mean(x - sd(x))
mymax <- mean(x) + sd(x)
print(paste("Min =", mymin))
print(paste("Max =", mymax))
}
customFunction2(x = 1:10)## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"
for loops and if else
statements
xx <- NULL #creates and empty object
for(i in 1:10) {
xx[i] <- i*3
}
xx## [1] 3 6 9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2## [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
if((xx[i] %% 2) == 0) {
print(paste(xx[i],"is Even"))
} else {
print(paste(xx[i],"is Odd"))
}
}## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")## [1] "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))## [1] "3 is Odd" "6 is Even" "9 is Odd" "12 is Even" "15 is Odd" "18 is Even" "21 is Odd" "24 is Even" "27 is Odd" "30 is Even"
Objects
Information can be stored in user defined objects, in multiple forms:
c(): a string of valuesmatrix(): a two dimensional matrix in one formatdata.frame(): a two dimensional matrix where each column can be a different formatlist():
A string…
xc <- 1:10
xc## [1] 1 2 3 4 5 6 7 8 9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc## [1] 1 2 3 4 5 6 7 8 9 10
A matrix…
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
A data frame…
xd <- data.frame(
x1 = c("aa","bb","cc","dd","ee",
"ff","gg","hh","ii","jj"),
x2 = 1:10,
x3 = c(1,1,1,1,1,2,2,2,3,3),
x4 = rep(c(1,2), times = 5),
x5 = rep(1:5, times = 2),
x6 = rep(1:5, each = 2),
x7 = seq(5, 50, by = 5),
x8 = log10(1:10),
x9 = (1:10)^3,
x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
A list…
xl <- list(xc, xm, xd)
xl[[1]]## [1] 1 2 3 4 5 6 7 8 9 10
xl[[2]]## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
xl[[3]]## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
Selecting Data
xc[5] # 5th element in xc## [1] 5
xd$x3[5] # 5th element in col "x3"## [1] 1
xd[5,"x3"] # row 5, col "x3"## [1] 1
xd$x3 # all of col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"## x4 x5
## 2 2 2
## 4 2 4
xl[[3]]$x1 # 3rd object in the list, col "x1## [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
regexpr
xx <- data.frame(Name = c("Item 1 (detail 1)",
"Item 20 (detail 20)",
"Item 300 (detail 300)"),
Item = NA,
Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx## Name Item Detail
## 1 Item 1 (detail 1) Item 1 detail 1
## 2 Item 20 (detail 20) Item 20 detail 20
## 3 Item 300 (detail 300) Item 300 detail 300
Data Formats
Data can also be saved in many formats:
- numeric
- integer
- character
- factor
- logical
xd$x3 <- as.character(xd$x3)
xd$x3## [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0## [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)## [1] 5
Internal structure of an object can be checked with
str()
str(xc) # c()## num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()## int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()## 'data.frame': 10 obs. of 10 variables:
## $ x1 : chr "aa" "bb" "cc" "dd" ...
## $ x2 : int 1 2 3 4 5 6 7 8 9 10
## $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
## $ x4 : num 1 2 1 2 1 2 1 2 1 2
## $ x5 : int 1 2 3 4 5 1 2 3 4 5
## $ x6 : int 1 1 2 2 3 3 4 4 5 5
## $ x7 : num 5 10 15 20 25 30 35 40 45 50
## $ x8 : num 0 0.301 0.477 0.602 0.699 ...
## $ x9 : num 1 8 27 64 125 216 343 512 729 1000
## $ x10: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()## List of 3
## $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
## $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 10 obs. of 10 variables:
## ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
## ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
## ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
## ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
## ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
## ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
## ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
## ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
## ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...
Packages
Additional libraries can be installed and loaded for use.
install.packages("scales")library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx## Values Rescaled
## 1 1 1.000000
## 2 2 4.222222
## 3 3 7.444444
## 4 4 10.666667
## 5 5 13.888889
## 6 6 17.111111
## 7 7 20.333333
## 8 8 23.555556
## 9 9 26.777778
## 10 10 30.000000
libraries can also be used without having to load them
scales::rescale(1:10, to = c(1,30))## [1] 1.000000 4.222222 7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000
Data Wrangling
R for Data Science - https://r4ds.had.co.nz/
xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
Data1 = 1:10,
Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 X 6 60 66 6000
## 7 X 7 70 77 7000
## 8 X 8 80 88 8000
## 9 Y 9 90 99 9000
## 10 Y 10 100 110 10000
xx$Data1 < 5 # which are less than 5## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]## Group Data2 NewData1
## 1 X 10 11
## 2 X 20 22
## 6 X 60 66
## 7 X 70 77
## 8 X 80 88
Data wrangling with tidyverse and pipes
(%>%)
library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
mutate(Data1 = 1:10,
Data2 = seq(10, 100, by = 10),
NewData1 = Data1 + Data2,
NewData2 = Data1 * 1000)
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 Y 6 60 66 6000
## 7 Y 7 70 77 7000
## 8 X 8 80 88 8000
## 9 X 9 90 99 9000
## 10 X 10 100 110 10000
filter(xx, Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Group == "X") %>%
select(Group, NewColName=Data2, NewData1)## Group NewColName NewData1
## 1 X 10 11
## 2 X 20 22
## 3 X 80 88
## 4 X 90 99
## 5 X 100 110
xs <- xx %>%
group_by(Group) %>%
summarise(Data2_mean = mean(Data2),
Data2_sd = sd(Data2),
NewData2_mean = mean(NewData2),
NewData2_sd = sd(NewData2))
xs## # A tibble: 2 × 5
## Group Data2_mean Data2_sd NewData2_mean NewData2_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X 60 41.8 6000 4183.
## 2 Y 50 15.8 5000 1581.
xx %>% left_join(xs, by = "Group")## Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1 X 1 10 11 1000 60 41.83300 6000 4183.300
## 2 X 2 20 22 2000 60 41.83300 6000 4183.300
## 3 Y 3 30 33 3000 50 15.81139 5000 1581.139
## 4 Y 4 40 44 4000 50 15.81139 5000 1581.139
## 5 Y 5 50 55 5000 50 15.81139 5000 1581.139
## 6 Y 6 60 66 6000 50 15.81139 5000 1581.139
## 7 Y 7 70 77 7000 50 15.81139 5000 1581.139
## 8 X 8 80 88 8000 60 41.83300 6000 4183.300
## 9 X 9 90 99 9000 60 41.83300 6000 4183.300
## 10 X 10 100 110 10000 60 41.83300 6000 4183.300
Read/Write data
xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)For excel sheets, the package readxl can be used to read
in sheets of data.
library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")Tidy Data
- Tutorial 1 - https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
- Tutorial 2 - https://r4ds.had.co.nz/tidy-data.html
yy <- xx %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = round(mean(DTF),1)) %>%
arrange(Location)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name Location Mean_DTF
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
## <chr> <dbl> <dbl> <dbl>
## 1 CDC Maxim AGL 86.7 134. 52.5
## 2 ILL 618 AGL 79.3 138. 47
## 3 Laird AGL 76.8 137. 56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name TraitName Value
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Name, value = Value)
yy## # A tibble: 3 × 4
## TraitName `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
## <chr> <dbl> <dbl> <dbl>
## 1 Jessore, Bangladesh 86.7 79.3 76.8
## 2 Metaponto, Italy 134. 138. 137.
## 3 Saskatoon, Canada 52.5 47 56.8
Base Plotting
We will start with some basic plotting using the base function
plot()
# A basic scatter plot
plot(x = xd$x8, y = xd$x9)# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)# Adjust plot type
plot(x = xd$x8, y = xd$x9, type = "line")# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")Now lets create some random and normally distributed data to make some more complicated plots
# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru## [1] 96.0909934 36.3319100 57.0543919 63.4470600 48.8769224 30.9849352 43.2130736 18.0674730 77.9270811 65.1550153 5.4954106 23.8779564 53.5706821
## [14] 35.4832303 21.4402204 39.3435647 34.5390148 70.6379455 56.6302475 23.5936907 82.3632459 38.5713657 93.4011715 23.1840488 19.8945236 77.3143760
## [27] 15.6624677 45.9533963 72.3855423 39.0371752 26.0009424 42.1324556 19.5740572 12.8479314 70.9721536 0.7501338 30.8773174 82.3849217 7.7080700
## [40] 48.2284738 24.9944032 24.4474838 37.2055267 99.6818187 57.0328898 49.5030206 93.8633366 4.2979580 94.5434677 14.3226101 53.9331771 34.1972585
## [53] 45.7525231 99.1295152 10.9430538 64.5497798 28.6946935 84.4600444 33.4347785 39.5425685 70.7851911 43.1022465 18.1259082 66.3035607 70.5554920
## [66] 29.0203822 48.6727686 70.6645737 94.9769992 25.4399703 7.1961194 46.4523844 51.1823816 61.2966206 6.1180875 41.1020698 66.2412272 18.2738848
## [79] 10.9573351 78.1315940 71.7686910 19.7264437 54.3229438 19.5455092 25.8790494 44.6906895 68.1190235 46.9439889 72.0533598 78.5771140 6.9147502
## [92] 84.9760539 91.1819080 42.5745242 19.5380920 47.1875864 84.3305283 97.8478734 54.2894681 28.0528330
plot(x = ru)order(ru)## [1] 36 48 11 75 91 71 39 55 79 34 50 27 8 63 78 95 84 33 82 25 15 24 20 12 42 41 70 85 31 100 57 66 37 6 59 52 17
## [38] 14 2 43 22 30 16 60 76 32 94 62 7 86 53 28 72 88 96 40 67 5 46 73 13 51 99 83 19 45 3 74 4 56 10 77 64 87
## [75] 65 18 68 61 35 81 89 29 26 9 80 90 21 38 97 58 92 93 23 47 49 69 1 98 54 44
ru<- ru[order(ru)]
ru## [1] 0.7501338 4.2979580 5.4954106 6.1180875 6.9147502 7.1961194 7.7080700 10.9430538 10.9573351 12.8479314 14.3226101 15.6624677 18.0674730
## [14] 18.1259082 18.2738848 19.5380920 19.5455092 19.5740572 19.7264437 19.8945236 21.4402204 23.1840488 23.5936907 23.8779564 24.4474838 24.9944032
## [27] 25.4399703 25.8790494 26.0009424 28.0528330 28.6946935 29.0203822 30.8773174 30.9849352 33.4347785 34.1972585 34.5390148 35.4832303 36.3319100
## [40] 37.2055267 38.5713657 39.0371752 39.3435647 39.5425685 41.1020698 42.1324556 42.5745242 43.1022465 43.2130736 44.6906895 45.7525231 45.9533963
## [53] 46.4523844 46.9439889 47.1875864 48.2284738 48.6727686 48.8769224 49.5030206 51.1823816 53.5706821 53.9331771 54.2894681 54.3229438 56.6302475
## [66] 57.0328898 57.0543919 61.2966206 63.4470600 64.5497798 65.1550153 66.2412272 66.3035607 68.1190235 70.5554920 70.6379455 70.6645737 70.7851911
## [79] 70.9721536 71.7686910 72.0533598 72.3855423 77.3143760 77.9270811 78.1315940 78.5771140 82.3632459 82.3849217 84.3305283 84.4600444 84.9760539
## [92] 91.1819080 93.4011715 93.8633366 94.5434677 94.9769992 96.0909934 97.8478734 99.1295152 99.6818187
plot(x = ru)# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd## [1] 45.27419 21.24775 60.03604 41.24859 49.00135 48.89081 68.16194 36.16961 32.33667 39.28913 80.22331 48.85236 60.73534 28.79508 47.00713 51.48241
## [17] 41.08321 61.89298 59.54496 40.24672 37.94363 52.45350 47.86144 61.83850 58.41759 50.80677 55.75651 42.71645 34.88947 36.72187 58.40154 58.10106
## [33] 36.74631 29.21919 57.78840 42.10098 49.17489 33.80226 62.58843 48.12089 49.89343 40.82116 54.26381 46.58705 40.54704 47.86578 45.11443 44.44583
## [49] 44.50966 63.99424 42.21372 45.50589 52.80402 53.53540 62.26164 28.97538 39.00370 53.53126 56.66642 66.28020 35.75612 37.50349 32.43241 28.74805
## [65] 45.61356 62.61303 33.18553 61.27382 51.70364 45.79045 53.07563 45.76097 62.69816 60.19736 65.42558 38.20679 47.12459 45.28588 42.17983 43.66357
## [81] 52.90251 46.89385 41.15676 51.48184 40.86169 41.31465 55.36368 45.62941 59.30061 35.52404 49.10332 57.43906 52.46387 45.43556 49.36732 54.29003
## [97] 30.66065 52.37781 60.98123 59.24251
nd <- nd[order(nd)]
nd## [1] 21.24775 28.74805 28.79508 28.97538 29.21919 30.66065 32.33667 32.43241 33.18553 33.80226 34.88947 35.52404 35.75612 36.16961 36.72187 36.74631
## [17] 37.50349 37.94363 38.20679 39.00370 39.28913 40.24672 40.54704 40.82116 40.86169 41.08321 41.15676 41.24859 41.31465 42.10098 42.17983 42.21372
## [33] 42.71645 43.66357 44.44583 44.50966 45.11443 45.27419 45.28588 45.43556 45.50589 45.61356 45.62941 45.76097 45.79045 46.58705 46.89385 47.00713
## [49] 47.12459 47.86144 47.86578 48.12089 48.85236 48.89081 49.00135 49.10332 49.17489 49.36732 49.89343 50.80677 51.48184 51.48241 51.70364 52.37781
## [65] 52.45350 52.46387 52.80402 52.90251 53.07563 53.53126 53.53540 54.26381 54.29003 55.36368 55.75651 56.66642 57.43906 57.78840 58.10106 58.40154
## [81] 58.41759 59.24251 59.30061 59.54496 60.03604 60.19736 60.73534 60.98123 61.27382 61.83850 61.89298 62.26164 62.58843 62.61303 62.69816 63.99424
## [97] 65.42558 66.28020 68.16194 80.22331
plot(x = nd)hist(x = nd)hist(nd, breaks = 20, col = "darkgreen")plot(x = density(nd))boxplot(x = nd)boxplot(x = nd, horizontal = T)ggplot2
Lets be honest, the base plots are ugly! The ggplot2
package gives the user to create a better, more visually appealing
plots. Additional packages such as ggbeeswarm and
ggrepel also contain useful functions to add to the
functionality of ggplot2.
- ggplot2 - https://ggplot2.tidyverse.org/
- Tutorial 1 - http://r-statistics.co/ggplot2-Tutorial-With-R.html
- Tutorial 2 - https://www.statsandr.com/blog/graphics-in-r-with-ggplot2/
- The R Graph Gallery - https://www.r-graph-gallery.com/ggplot2-package.html
library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()mp + geom_point(aes(color = x3, shape = x3), size = 4)mp + geom_line(size = 2)mp + geom_line(aes(color = x3), size = 2)mp + geom_smooth(method = "loess")mp + geom_smooth(method = "lm")xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
rnorm(50, mean = 60, sd = 5)),
group = factor(rep(1:2, each = 50)),
label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")mp + geom_histogram(color = "black", position = "dodge")mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1mp + geom_density(alpha = 0.5)mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")mp + geom_boxplot() + geom_point()mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")library(ggbeeswarm)
mp + geom_quasirandom()mp + geom_quasirandom(aes(shape = group))mp2 <- mp + geom_violin() +
geom_boxplot(width = 0.1, fill = "white") +
geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
common.legend = T, legend = "bottom")Statistics
- Handbook of Biological Statistics - http://biostathandbook.com/
- R Companion for ^ - https://rcompanion.org/rcompanion/a_02.html
# Prep data
lev_Loc <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
mutate(Location = factor(Location, levels = lev_Loc),
Name = factor(Name, levels = lev_Name))
xx <- dd %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
## <fct> <dbl> <dbl> <dbl>
## 1 ILL 618 AGL 47 79.3 138.
## 2 CDC Maxim AGL 52.5 86.7 134.
## 3 Laird AGL 56.8 76.8 137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF,
color = Name, group = Name, shape = Name)) +
geom_point(size = 2.5, alpha = 0.7) +
geom_line(size = 1, alpha = 0.7) +
theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.
summary(aov(DTF ~ Name * Location, data = dd))## Df Sum Sq Mean Sq F value Pr(>F)
## Name 2 88 44 3.476 0.0395 *
## Location 2 65863 32931 2598.336 < 2e-16 ***
## Name:Location 4 560 140 11.044 2.52e-06 ***
## Residuals 45 570 13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.
If we only have two things to compare, we could do a t-test.
xx <- dd %>%
filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)##
## Welch Two Sample t-test
##
## data: xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -32.18265 -25.48402
## sample estimates:
## mean of x mean of y
## 52.11111 80.94444
DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.
xx <- dd %>%
filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
Location == "Metaponto, Italy") %>%
spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)##
## Welch Two Sample t-test
##
## data: xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.059739 7.059739
## sample estimates:
## mean of x mean of y
## 137.8333 136.8333
DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.
pch Plot
xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
y = rep(5:1, each = 6, length.out = 26),
pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
geom_point(color = "darkred", fill = "darkblue", size = 5) +
geom_text(aes(label = pch), nudge_x = -0.25) +
scale_shape_manual(values = xx$pch) +
scale_x_continuous(breaks = 6:1) +
scale_y_continuous(breaks = 6:1) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Plot symbols in R (pch)",
subtitle = "color = \"darkred\", fill = \"darkblue\"",
x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")R Markdown
Tutorials on how to create an R markdown document like this one can be found here: